## Replication of numbers in text, as well as figure 1, 9, and 10 ##
rm(list = ls())
library(tidyverse)
library(igraph)
library(ggraph)
library(data.table)
library(gridExtra)

cenl2 <- read_csv("nltrial_clean2.csv")
cenl <- read_csv("nltrial_clean.csv")
ceelall <- read_csv("eltrial_clean.csv")

### replication of numbers mentioned in text: ####

# abstract and Results: 
# number of experts 
dim(cenl)

# number of nominations
dim(ceelall)

# response rate:
table(!is.na(cenl$dateresponse)) # this includes individuals who didn't fill out the survey, but answered via e-mail, mostly to decline filling out the survey:
table(cenl$responded, useNA="always")
sum(cenl$responded)/2107 # 2107 is the number of individuals for which we have found contact details

# differential response rates:
t.test(responded ~ gender=="f", data=cenl)
t.test(responded ~ typeinstsimp=="academic", data=cenl) # note that this figure may be off because of introduction of 9 NAs to protect respondents' anonymity
t.test(responded ~ georegion=="China", data=cenl)

# composition of field:
table(cenl$gender, useNA="always")
table(cenl$gender, useNA="always")/length(cenl$gender)

# women more likely to be nominated as "dark horses" (type==1)
t.test(type==1 ~ gender, data= left_join(ceelall, cenl, by=c("target" = "ID")))
dt <- left_join(ceelall, cenl, by=c("target" = "ID")) %>% filter(type==1) %>% group_by(gender) %>% summarise(n=n())
dt$n[dt$gender=="f"]/sum(dt$n)

table(cenl$typeinstsimp, useNA="always")/length(cenl$typeinstsimp) # note that this figure may be off because of introduction of 9 NAs to protect respondents' anonymity
table(cenl$georegion, useNA="always")/length(cenl$georegion)

### Figure 1 ###
dt <- cenl %>% group_by(georegion, responded) %>% summarise(count=n()) 
dt2 <- dt %>% group_by(georegion) %>% summarise(total=sum(count))
dt <- left_join(dt, dt2) 

dt$label <- NA
dt$label[!dt$responded] <- dt$total[!dt$responded]
dt$label[dt$responded] <- paste(round(100*dt$count[dt$responded]/dt$total[dt$responded], digits=0), "%", sep="")
dt$label[dt$responded & dt$georegion %in% c("North America (other)", "South America", "Africa", "MiddleEast")] <- ""

(p1 <-  dt %>% ggplot(aes(x = reorder(georegion, -count), y = count, fill = responded)) + 
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label=label), vjust=0) +
  ggtitle("Geographical region") + 
  theme_bw()+
  theme(legend.position="none") +
  scale_fill_manual(values=c("#999999", "#555555")) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  xlab("") + 
  ylab(""))


dt <- cenl %>% group_by(typeinstsimp, responded) %>% summarise(count=n()) 
dt2 <- dt %>% group_by(typeinstsimp) %>% summarise(total=sum(count))
dt <- left_join(dt, dt2) 

dt$label <- NA
dt$label[!dt$responded] <- dt$total[!dt$responded]
dt$label[dt$responded] <- paste(round(100*dt$count[dt$responded]/dt$total[dt$responded], digits=0), "%", sep="")
dt$label[dt$responded & dt$typeinstsimp %in% c("IO", "independent", "government", "NGO")] <- ""

(p2 <- dt %>%
  ggplot(aes(x = reorder(typeinstsimp, -count), y = count, fill = responded)) + 
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label=label), vjust=0) +
  ggtitle("Institutional work affiliation") + 
  theme_bw() +
  theme(legend.position="none")+
  scale_fill_manual(values=c("#999999", "#555555")) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  xlab("") + 
  ylab(""))

dt <- cenl %>% group_by(gender, responded) %>% summarise(count=n()) 
dt2 <- dt %>% group_by(gender) %>% summarise(total=sum(count))
dt <- left_join(dt, dt2) 

dt$label <- NA
dt$label[!dt$responded] <- dt$total[!dt$responded]
dt$label[dt$responded] <- paste(round(100*dt$count[dt$responded]/dt$total[dt$responded], digits=0), "%", sep="")
dt$gender[is.na(dt$gender)] <- "NA"
dt$gender[dt$gender == "m"] <- "male"
dt$gender[dt$gender == "f"] <- "female"


(p3 <- dt %>%
  ggplot(aes(x = reorder(gender, -count), y = count, fill = responded)) + 
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label=label), vjust=0) +
  ggtitle("Gender") + 
  theme_bw() +
  #theme(axis.text.x = element_text(angle = 30))+
  scale_fill_manual(values=c("#999999", "#555555")) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  xlab("") + 
  ylab(""))
grid.arrange(p1, p2, p3, ncol=3)

grob <- arrangeGrob(p1, p2, p3, ncol=3, widths = c(1,1,.75))

# percentage of experts who went to unique university:

cenl2$UGduplicated <- duplicated(cenl2$UGUni, fromLast = T) | duplicated(cenl2$UGUni, fromLast = F)
table(sum(!(cenl2$UGduplicated))/sum(!is.na(cenl2$UGUni)))

cenl2$PhDduplicated <- duplicated(cenl2$PhDUni, fromLast = T) | duplicated(cenl2$PhDUni, fromLast = F)
table(sum(!(cenl2$PhDduplicated))/sum(!is.na(cenl2$PhDUni)))

# share of China experts where we do have age information: #
table(sum(!is.na(cenl2$cohort))/length(cenl2$cohort))

##### Figure 9: Same for top 20 China Watchers #####

table(cenl$indegree, useNA="always") # so the top20 (21) are anyone with indegree above 33

cenl$top <- cenl$indegree>33
table(cenl$top, useNA="always")

dt <- cenl %>% filter(top) %>% group_by(georegion) %>% summarise(count=n()) 
dt$label <- paste(round(100*dt$count/sum(dt$count), digits=0), "%", sep="")

p1 <-  dt %>% ggplot(aes(x = reorder(georegion, -count), y = count)) + 
  geom_bar(stat = "identity") +
  geom_text(aes(label=label),vjust=-0.25) +
  ggtitle("Geographical region") + 
  theme_bw()+
  theme(legend.position="none") +
  scale_fill_manual(values=c("#999999")) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  xlab("") + 
  ylab("")

dt <- cenl %>% filter(top) %>% group_by(typeinstsimp) %>% summarise(count=n()) 
dt$label <- paste(round(100*dt$count/sum(dt$count), digits=0), "%", sep="")

p2 <- dt %>%
  ggplot(aes(x = reorder(typeinstsimp, -count), y = count)) + 
  geom_bar(stat = "identity") +
  geom_text(aes(label=label),vjust=-0.25) +
  ggtitle("Institutional work affiliation") + 
  theme_bw() +
  theme(legend.position="none")+
  scale_fill_manual(values=c("#999999")) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  xlab("") + 
  ylab("")

dt <- cenl %>% filter(top) %>% group_by(gender) %>% summarise(count=n()) 
dt$label <- paste(round(100*dt$count/sum(dt$count), digits=0), "%", sep="")

p3 <- dt %>%
  ggplot(aes(x = reorder(gender, -count), y = count)) + 
  geom_bar(stat = "identity") +
  geom_text(aes(label=label),vjust=-0.25) +
  ggtitle("Gender") + 
  theme_bw() +
  #theme(axis.text.x = element_text(angle = 30))+
  scale_fill_manual(values=c("#999999")) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  xlab("") + 
  ylab("") + scale_x_discrete(labels=c("m" = " \nmale", "f" = " \nfemale",
                                       "NA" = " \nNA"))
grid.arrange(p1, p2, p3, ncol=3)
grob <- arrangeGrob(p1, p2, p3, ncol=3, widths = c(1,1,.75))


##### Figure 10: Same for top 100 China Watchers #####

table(cenl$indegree, useNA="always") # so the top 100 (102) are anyone with indegree abvoe 11?

cenl$top <- cenl$indegree>11
table(cenl$top, useNA="always")


dt <- cenl %>% filter(top) %>% group_by(georegion) %>% summarise(count=n()) 
dt$label <- paste(round(100*dt$count/sum(dt$count), digits=0), "%", sep="")

p1 <-  dt %>% ggplot(aes(x = reorder(georegion, -count), y = count)) + 
  geom_bar(stat = "identity") +
  geom_text(aes(label=label),vjust=-0.25) +
  ggtitle("Geographical region") + 
  theme_bw()+
  theme(legend.position="none") +
  scale_fill_manual(values=c("#999999")) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  xlab("") + 
  ylab("")

dt <- cenl %>% filter(top) %>% group_by(typeinstsimp) %>% summarise(count=n()) 
dt$label <- paste(round(100*dt$count/sum(dt$count), digits=0), "%", sep="")

p2 <- dt %>%
  ggplot(aes(x = reorder(typeinstsimp, -count), y = count)) + 
  geom_bar(stat = "identity") +
  geom_text(aes(label=label),vjust=-0.25) +
  ggtitle("Institutional work affiliation") + 
  theme_bw() +
  theme(legend.position="none")+
  scale_fill_manual(values=c("#999999")) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  xlab("") + 
  ylab("")

dt <- cenl %>% filter(top) %>% group_by(gender) %>% summarise(count=n()) 
dt$label <- paste(round(100*dt$count/sum(dt$count), digits=0), "%", sep="")

p3 <- dt %>%
  ggplot(aes(x = reorder(gender, -count), y = count)) + 
  geom_bar(stat = "identity") +
  geom_text(aes(label=label),vjust=-0.25) +
  ggtitle("Gender") + 
  theme_bw() +
  #theme(axis.text.x = element_text(angle = 30))+
  scale_fill_manual(values=c("#999999")) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  xlab("") + 
  ylab("") + scale_x_discrete(labels=c("m" = " \nmale", "f" = " \nfemale",
                                       "NA" = " \nNA"))
grid.arrange(p1, p2, p3, ncol=3)
grob <- arrangeGrob(p1, p2, p3, ncol=3, widths = c(1,1,.75))

